案例背景: 同学们进入大学后,经过一年的学习和生活,对于未来的规划更为清晰,同时对所学专业有了一定认识,自然有部分同学对所学专业不够喜欢,有转专业的需求。转专业原因可能各不相同:高考发挥失利没能进入自己心仪专业,进入了心仪专业却发现根本不是那么回事,或者发现其他专业就业及薪金待遇更好,于是变心了……希望基于我校前几年转专业申请数据,分析校内专业热度排名。
南开大学转专业工作一般于每年四月启动,原则上面向大一和大二年级学生。学生可根据各接收学院公布的接收专业名额和条件提出转专业申请,经所在学院备案,接收学院考核择优选拔后,报教务处核准执行。学生考虑转专业的原因很多,有在大学中认清学习方向,有的是录取时候被调剂,有的是受其他人的影响。学生转专业的去向一般是热门专业和高薪专业,或者是对某个专业的热爱向往,导致了转专业的过程。
基于以上分析,从转专业申请数据一定程度上能够看出某个专业的热门程度和洞察转专业制度上的潜在规律,然后进行逐年的分析,然后获得转专业变化情况,对实际的学习生活进行更好的指导。
通过三年的转专业数据,本词案例构建的网络有两个层级,专业与专业之间的网络,学院与学院之间的网络,分别把转专业学生的专业和专业所在学院看做图的顶点,如果是相同专业或学院的则连成一条边。建立专业与专业之间的关系图,并分析转专业学生的转入院和转出院之间的关系。从而构建邻接矩阵,由初始值出发,使用Pagerank算法进行迭代:
输入: 含有 个结点的有向图,转移矩阵 阻尼因子 , 初始向量 输出: 有向图的 PageRank 向量
(1)令
(2) 计算
(3) 如果 与 充分接近,令 , 停止迭代。
(4) 否则, 执行步 (2)。
从而进行预测和专业热度的排序,最后引入了一些其他度量网络中节点Centrality的方法。
数据为2018,2019,2020年申请转专业的数据,数据已经进行脱敏处理,只涉及到转入转出的专业学院,不涉及到个人信息,展示如下:
本作业的学习目标包括:
# 读入包
library("ggplot2")
library("stringr")
library("reshape2")
library("igraph")
library("dplyr")
# 读取数据
data18=read.csv('2018.csv',fileEncoding="UTF-8-BOM")
data19=read.csv('2019.csv',fileEncoding="UTF-8-BOM")
data20=read.csv('2020.csv',fileEncoding="UTF-8-BOM")
# 整理出转入专业数据
data1=table(data18$转入专业)
data1=cbind(row.names(data1),data1)
colnames(data1) = c("转入专业", "人数")
data1=data.frame(data1)
data1$人数=as.numeric(data1$人数)
data2=table(data19$转入专业)
data2=cbind(row.names(data2),data2)
colnames(data2) = c("转入专业", "人数")
data2=data.frame(data2)
data2$人数=as.numeric(data2$人数)
data3=table(data20$转入专业)
data3=cbind(row.names(data3),data3)
colnames(data3) = c("转入专业", "人数")
data3=data.frame(data3)
data3$人数=as.numeric(data3$人数)
# 将数据转换为频率分布
total1=sum(data1$人数)
total2=sum(data2$人数)
total3=sum(data3$人数)
data1$time=2018
data1$freq=data1$人数/total1
data2$time=2019
data2$freq=data2$人数/total2
data3$time=2020
data3$freq=data3$人数/total3
data123=rbind(data1,data2,data3)
# 画图
options(repr.plot.width = 20, repr.plot.height = 8)
pl1 <- ggplot(data=data1, aes(x=转入专业, y=人数,fill=转入专业), fill=转入专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl2 <- ggplot(data=data2, aes(x=转入专业, y=人数,fill=转入专业), fill=转入专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl3 <- ggplot(data=data3, aes(x=转入专业, y=人数,fill=转入专业), fill=转入专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl1;pl2;pl3
options(repr.plot.width = 20, repr.plot.height = 30)
ggplot(data = data123, aes(x = time, y = freq)) + geom_line() +
facet_wrap(~转入专业)
# 整理出转出专业数据
data4=table(data18$所在专业)
data4=cbind(row.names(data4),data4)
colnames(data4) = c("所在专业", "人数")
data4=data.frame(data4)
data4$人数=as.numeric(data4$人数)
data5=table(data19$所在专业)
data5=cbind(row.names(data5),data5)
colnames(data5) = c("所在专业", "人数")
data5=data.frame(data5)
data5$人数=as.numeric(data5$人数)
data6=table(data20$所在专业)
data6=cbind(row.names(data6),data6)
colnames(data6) = c("所在专业", "人数")
data6=data.frame(data6)
data6$人数=as.numeric(data6$人数)
# 将数据转换为频率分布
total4=sum(data4$人数)
total5=sum(data5$人数)
total6=sum(data6$人数)
data4$time=2018
data4$freq=data4$人数/total4
data5$time=2019
data5$freq=data5$人数/total5
data6$time=2020
data6$freq=data6$人数/total6
data456=rbind(data4,data5,data6)
# 画图
options(repr.plot.width = 20, repr.plot.height = 8)
pl4 <- ggplot(data=data4, aes(x=所在专业, y=人数,fill=所在专业), fill=所在专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl5 <- ggplot(data=data5, aes(x=所在专业, y=人数,fill=所在专业), fill=所在专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl6 <- ggplot(data=data6, aes(x=所在专业, y=人数,fill=所在专业), fill=所在专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl4;pl5;pl6
options(repr.plot.width = 20, repr.plot.height = 30)
ggplot(data = data456, aes(x = time, y = freq)) + geom_line() +
facet_wrap(~所在专业)
# 得到总的转专业数据
datatotal=rbind(data18,data19,data20)
# 总体数据的预处理与转换
data7=table(datatotal$转入专业)
data7=cbind(row.names(data7),data7)
colnames(data7) = c("转入专业", "人数")
data7=data.frame(data7)
data7$人数=as.numeric(data7$人数)
data8=table(datatotal$所在专业)
data8=cbind(row.names(data8),data8)
colnames(data8) = c("所在专业", "人数")
data8=data.frame(data8)
data8$人数=as.numeric(data8$人数)
# 画图
options(repr.plot.width = 20, repr.plot.height = 30)
ggplot(data = data123, aes(x = time, y = freq)) + geom_line() +
facet_wrap(~转入专业)
options(repr.plot.width = 25, repr.plot.height = 10)
pl7 <- ggplot(data=data7, aes(x=转入专业, y=人数,fill=转入专业), fill=转入专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl8 <- ggplot(data=data8, aes(x=所在专业, y=人数,fill=所在专业), fill=所在专业) +
geom_bar(stat="identity",width = 1)+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5,colour = "darkred", size = rel(1.6)))
pl7;pl8
write.csv(data.frame(row.names(data7)),file = "in.csv",row.names = F)
write.csv(data.frame(row.names(data8)),file = "out.csv",row.names = F)
采集各专业方向学生薪金待遇,考察转出专业与转入专业薪金差距。
list1=data8[order(data8[,2],decreasing=T),][1:10,]$所在专业
list2=data7[order(data7[,2],decreasing=T),][1:10,]$转入专业
income1=read.csv("in.csv",fileEncoding="UTF-8")
income2=read.csv("out.csv",fileEncoding="UTF-8")
income1$工资=as.numeric(income1$工资)
income2$工资=as.numeric(income2$工资)
#转出排行专业工资平均
mean(income2[which(income2$专业 %in% list1), ][,2])
#转入排行专业工资平均
mean(income1[which(income1$专业 %in% list2), ][,2])
利用PageRank对专业进行排名(分文理?)
按年做分析,看是否有变化。最后也可合并在一起。
提示:
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data18$所在院系,to=data18$转入院系)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data18$所在专业,to=data18$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
pr1 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr1$vector[order(pr1$vector,decreasing=T)]
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data19$所在院系,to=data19$转入院系)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data19$所在专业,to=data19$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
pr2 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr2$vector[order(pr2$vector,decreasing=T)]
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data20$所在院系,to=data20$转入院系)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data20$所在专业,to=data20$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g, steps = 6)
V(g)$sg = com$membership + 1
V(g)$color = rainbow(max(V(g)$sg))[V(g)$sg]
plot(g, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g)$color)
pr3 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr3$vector[order(pr3$vector,decreasing=T)]
rank18=data.frame(pr1$vector)
rank19=data.frame(pr2$vector)
rank20=data.frame(pr3$vector)
rank18=cbind(row.names(rank18),rank18);colnames(rank18)=c('major','rank');rank18$year=2018
rank19=cbind(row.names(rank19),rank19);colnames(rank19)=c('major','rank');rank19$year=2019
rank20=cbind(row.names(rank20),rank20);colnames(rank20)=c('major','rank');rank20$year=2020
rankchange=rbind(rank18,rank19,rank20)
library(ggplot2)
options(repr.plot.width = 20, repr.plot.height = 30)
ggplot(data = rankchange, aes(x = year, y = rank)) + geom_line() +
facet_wrap(~major)
data=rbind(data18,data19,data20)
options(repr.plot.width = 20, repr.plot.height = 20)
relations=data.frame(from=data$所在专业,to=data$转入专业)
g1 <- graph_from_data_frame(relations, directed=TRUE)
com = walktrap.community(g1, steps = 6)
V(g1)$sg = com$membership + 1
V(g1)$color = rainbow(max(V(g1)$sg))[V(g1)$sg]
plot(g1, layout=layout.fruchterman.reingold,edge.color = grey(0.4),edge.arrow.mode = ">",
vertex.size =4,vertex.shape='circle',vertex.label.cex=1.0,vertex.label.color='black',
edge.arrow.size=0.1, vertex.color = V(g1)$color)
pr4 <- page_rank(g1, algo = c("prpack", "arpack", "power"), vids = V(g1),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr4$vector[order(pr4$vector,decreasing=T)]
list3=c('汉语言文化学院','马克思主义学院', '旅游与服务学院' ,'周恩来政府管理学院', '历史学院','法学院','经济学院','文学院','哲学院','外国语学院' )
list4=c('化学学院','商学院','物理科学学院','医学院','金融学院','生命科学学院','数学科学学院','药学院','统计与数据科学学院')
list5=c('计算机学院' ,'软件学院','网络空间安全学院' ,'人工智能学院','材料科学与工程学院','计算机与控制工程学院','电子信息与光学工程学院','环境科学与工程学院')
# 按类别划分数据
lit=data[which(data$所在院系 %in% list3), ]
sci=data[which(data$所在院系 %in% list4), ]
eng=data[which(data$所在院系 %in% list5), ]
relations=data.frame(from=lit$所在专业,to=lit$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
pr5 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr5$vector[order(pr5$vector,decreasing=T)][1:5]
relations=data.frame(from=sci$所在专业,to=sci$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
pr6 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr6$vector[order(pr6$vector,decreasing=T)][1:5]
relations=data.frame(from=eng$所在专业,to=eng$转入专业)
g <- graph_from_data_frame(relations, directed=TRUE)
pr7 <- page_rank(g, algo = c("prpack", "arpack", "power"), vids = V(g),
directed = TRUE, damping = 0.85, personalized = NULL, weights = NULL,
options = NULL)
pr7$vector[order(pr7$vector,decreasing=T)][1:5]
由上面的得到三个不同方向的转专业学生心中的热门专业:
degree=degree(g1, mode="all")
degree[order(degree,decreasing=T)][1:5]
度排名前五的为:经济学类,化学类,法学,物理学类,旅游管理类。
closeness=closeness(g1, mode="all", weights=NA, normalized=T)
closeness[order(closeness,decreasing=T)][1:6]
betweenness=betweenness(g1, directed=F, weights=NA, normalized = T)
betweenness[order(betweenness,decreasing=T)][1:5]
由于转专业,不管转出,转入也好,经济学类,物理学类,法学,化学类和计算机科学与技术是人员流动比较大的专业。